{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2020-12-04 09:25:00--  https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt\n",
      "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.253\n",
      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.253|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 231508 (226K) [text/plain]\n",
      "Saving to: ‘/tmp/bert-base-uncased-vocab.txt’\n",
      "\n",
      "/tmp/bert-base-unca 100%[===================>] 226.08K  --.-KB/s    in 0.06s   \n",
      "\n",
      "2020-12-04 09:25:00 (3.87 MB/s) - ‘/tmp/bert-base-uncased-vocab.txt’ saved [231508/231508]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt -O /tmp/bert-base-uncased-vocab.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import BertWordPieceTokenizer\n",
    "from tokenizers.tools import EncodingVisualizer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "EncodingVisualizer.unk_token_regex.search(\"aaa[udsnk]aaa\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"Mathias Bynens 'Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞': Whenever you’re working on a piece of JavaScript code that deals with strings or regular expressions in some way, just add a unit test that contains a pile of poo (💩) in a string, 💩💩💩💩💩💩💩💩💩💩💩💩 and see if anything breaks. It’s a quick, fun, and easy way to see if your code supports astral symbols. Once you’ve found a Unicode-related bug in your code, all you need to do is apply the techniques discussed in this post to fix it.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = BertWordPieceTokenizer(\"/tmp/bert-base-uncased-vocab.txt\", lowercase=True)\n",
    "visualizer = EncodingVisualizer(tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualizing Tokens With No Annotations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <html>\n",
       "        <head>\n",
       "            <style>\n",
       "                .tokenized-text {\n",
       "    width:100%;\n",
       "    padding:2rem;\n",
       "    max-height: 400px;\n",
       "    overflow-y: auto;\n",
       "    box-sizing:border-box;\n",
       "    line-height:4rem; /* Lots of space between lines */\n",
       "    font-family: \"Roboto Light\", \"Ubuntu Light\", \"Ubuntu\", monospace;\n",
       "    box-shadow: 2px 2px 2px rgba(0,0,0,0.2);\n",
       "    background-color: rgba(0,0,0,0.01);\n",
       "    letter-spacing:2px; /* Give some extra separation between chars */\n",
       "}\n",
       ".non-token{\n",
       "    /* White space and other things the tokenizer ignores*/\n",
       "    white-space: pre;\n",
       "    letter-spacing:4px;\n",
       "    border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/\n",
       "    border-bottom:1px solid #A0A0A0;\n",
       "    line-height: 1rem;\n",
       "    height: calc(100% - 2px);\n",
       "}\n",
       "\n",
       ".token {\n",
       "    white-space: pre;\n",
       "    position:relative;\n",
       "    color:black;\n",
       "    letter-spacing:2px;\n",
       "}\n",
       "\n",
       ".annotation{\n",
       "    white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */\n",
       "    border-radius:4px;\n",
       "    position:relative;\n",
       "    width:fit-content;\n",
       "}\n",
       ".annotation:before {\n",
       "    /*The before holds the text and the after holds the background*/\n",
       "    z-index:1000; /* Make sure this is above the background */\n",
       "    content:attr(data-label); /* The annotations label is on a data attribute */\n",
       "    color:white;\n",
       "    position:absolute;\n",
       "    font-size:1rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    left:0;\n",
       "    width:100%;\n",
       "    padding:0.5rem 0;\n",
       "    /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "    text-overflow:ellipsis;\n",
       "}\n",
       "\n",
       ".annotation:after {\n",
       "    content:attr(data-label); /* The content defines the width of the annotation*/\n",
       "    position:absolute;\n",
       "    font-size:0.75rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "    text-overflow:ellipsis;\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "\n",
       "    left:0;\n",
       "    width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
       "\n",
       "    padding:0.5rem 0;\n",
       "    /* Nast hack below:\n",
       "    We set the annotations color in code because we don't know the colors at css time.\n",
       "    But you can't pass a color as a data attribute to get it into the pseudo element (this thing)\n",
       "    So to get around that, annotations have the color set on them with a style attribute and then we\n",
       "    can get the color with currentColor.\n",
       "    Annotations wrap tokens and tokens set the color back to black\n",
       "     */\n",
       "    background-color: currentColor;\n",
       "}\n",
       ".annotation:hover::after, .annotation:hover::before{\n",
       "    /* When the user hovers over an annotation expand the label to display in full\n",
       "     */\n",
       "    min-width: fit-content;\n",
       "}\n",
       "\n",
       ".annotation:hover{\n",
       "    /* Emphasize the annotation start end with a border on hover*/\n",
       "    border-color: currentColor;\n",
       "    border: 2px solid;\n",
       "}\n",
       ".special-token:not(:empty){\n",
       "    /*\n",
       "    A none empty special token is like UNK (as opposed to CLS which has no representation in the text )\n",
       "     */\n",
       "    position:relative;\n",
       "}\n",
       ".special-token:empty::before{\n",
       "    /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/\n",
       "    content:attr(data-stok);\n",
       "    background:#202020;\n",
       "    font-size:0.75rem;\n",
       "    color:white;\n",
       "    margin: 0 0.25rem;\n",
       "    padding: 0.25rem;\n",
       "    border-radius:4px\n",
       "}\n",
       "\n",
       ".special-token:not(:empty):before {\n",
       "    /* Special tokens that have text (UNK) are displayed above the actual text*/\n",
       "    content:attr(data-stok);\n",
       "    position:absolute;\n",
       "    bottom:1.75rem;\n",
       "    min-width:100%;\n",
       "    width:100%;\n",
       "    height:1rem;\n",
       "    line-height:1rem;\n",
       "    font-size:1rem;\n",
       "    text-align:center;\n",
       "    color:white;\n",
       "    font-weight:bold;\n",
       "    background:#202020;\n",
       "    border-radius:10%;\n",
       "}\n",
       "/*\n",
       "We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations\n",
       "instead we apply even and odd class at generation time and color them that way\n",
       " */\n",
       ".even-token{\n",
       "    background:#DCDCDC\t;\n",
       "    border: 1px solid #DCDCDC;\n",
       "}\n",
       ".odd-token{\n",
       "    background:#A0A0A0;\n",
       "    border: 1px solid #A0A0A0;\n",
       "}\n",
       ".even-token.multi-token,.odd-token.multi-token{\n",
       "    background:  repeating-linear-gradient(\n",
       "    45deg,\n",
       "    transparent,\n",
       "    transparent 1px,\n",
       "    #ccc 1px,\n",
       "    #ccc 1px\n",
       "    ),\n",
       "    /* on \"bottom\" */\n",
       "    linear-gradient(\n",
       "    to bottom,\n",
       "    #FFB6C1,\n",
       "    #999\n",
       "    );\n",
       "}\n",
       "\n",
       ".multi-token:hover::after {\n",
       "    content:\"This char has more than 1 token\"; /* The content defines the width of the annotation*/\n",
       "    color:white;\n",
       "    background-color: black;\n",
       "    position:absolute;\n",
       "    font-size:0.75rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "    text-overflow:ellipsis;\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "    left:0;\n",
       "    width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
       "    padding:0.5rem 0;\n",
       "}\n",
       "\n",
       "            </style>\n",
       "        </head>\n",
       "        <body>\n",
       "            <div class=\"tokenized-text\" dir=auto>\n",
       "            <span class=\"token odd-token\"  >Mathias</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >By</span><span class=\"token odd-token\"  >nen</span><span class=\"token even-token\"  >s</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >'</span><span class=\"token even-token\"  >Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A</span><span class=\"non-token\"  >̴̵̜̰͔ͫ͗͢</span><span class=\"token odd-token\"  >L</span><span class=\"non-token\"  >̠ͨͧͩ͘</span><span class=\"token even-token\"  >G̴̻͈͍͔̹̑͗̎̅͛́O</span><span class=\"non-token\"  >̵̨̹̻̝̳͂̌̌͘</span><span class=\"token odd-token\"  >!</span><span class=\"non-token\"  >͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞</span><span class=\"token even-token\"  >'</span><span class=\"token odd-token\"  >:</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >Whenever</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >you</span><span class=\"token even-token\"  >’</span><span class=\"token odd-token\"  >re</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >working</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >on</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >piece</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >of</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >Java</span><span class=\"token even-token\"  >Script</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >code</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >that</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >deals</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >with</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >strings</span><span 
class=\"non-token\"  > </span><span class=\"token even-token\"  >or</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >regular</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >expressions</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >some</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >way</span><span class=\"token even-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >just</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >add</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >unit</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >test</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >that</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >contains</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >pile</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >of</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >po</span><span class=\"token even-token\"  >o</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >(</span><span class=\"token even-token special-token\"  data-stok=\"[UNK]\" >💩</span><span class=\"token odd-token\"  >)</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >string</span><span class=\"token odd-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token 
even-token special-token\"  data-stok=\"[UNK]\" >💩💩💩💩💩💩💩💩💩💩💩💩</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >and</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >see</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >if</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >anything</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >breaks</span><span class=\"token even-token\"  >.</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >It</span><span class=\"token even-token\"  >’</span><span class=\"token odd-token\"  >s</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >quick</span><span class=\"token even-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >fun</span><span class=\"token even-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >and</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >easy</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >way</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >to</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >see</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >if</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >your</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >code</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >supports</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >as</span><span class=\"token odd-token\"  >tral</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >symbols</span><span class=\"token odd-token\"  >.</span><span 
class=\"non-token\"  > </span><span class=\"token even-token\"  >Once</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >you</span><span class=\"token even-token\"  >’</span><span class=\"token odd-token\"  >ve</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >found</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >Unicode</span><span class=\"token odd-token\"  >-</span><span class=\"token even-token\"  >related</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >bug</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >your</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >code</span><span class=\"token odd-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >all</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >you</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >need</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >to</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >do</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >is</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >apply</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >the</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >techniques</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >discussed</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >this</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  
>post</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >to</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >fix</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >it</span><span class=\"token even-token\"  >.</span>\n",
       "            </div>\n",
       "        </body>\n",
       "    </html>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "visualizer(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualizing Tokens With Aligned Annotations\n",
    "First we make some annotations with the Annotation class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers.tools import Annotation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "anno1 = Annotation(start=0, end=2, label=\"foo\")\n",
    "anno2 = Annotation(start=2, end=4, label=\"bar\")\n",
    "anno3 = Annotation(start=6, end=8, label=\"poo\")\n",
    "anno4 = Annotation(start=9, end=12, label=\"shoe\")\n",
    "annotations=[\n",
    "    anno1,\n",
    "    anno2,\n",
    "    anno3,\n",
    "    anno4,\n",
    "    Annotation(start=23, end=30, label=\"random tandem bandem sandem landem fandom\"),\n",
    "    Annotation(start=63, end=70, label=\"foo\"),\n",
    "    Annotation(start=80, end=95, label=\"bar\"),\n",
    "    Annotation(start=120, end=128, label=\"bar\"),\n",
    "    Annotation(start=152, end=155, label=\"poo\"),\n",
    "]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <html>\n",
       "        <head>\n",
       "            <style>\n",
       "                .tokenized-text {\n",
       "    width:100%;\n",
       "    padding:2rem;\n",
       "    max-height: 400px;\n",
       "    overflow-y: auto;\n",
       "    box-sizing:border-box;\n",
       "    line-height:4rem; /* Lots of space between lines */\n",
       "    font-family: \"Roboto Light\", \"Ubuntu Light\", \"Ubuntu\", monospace;\n",
       "    box-shadow: 2px 2px 2px rgba(0,0,0,0.2);\n",
       "    background-color: rgba(0,0,0,0.01);\n",
       "    letter-spacing:2px; /* Give some extra separation between chars */\n",
       "}\n",
       ".non-token{\n",
       "    /* White space and other things the tokenizer ignores*/\n",
       "    white-space: pre;\n",
       "    letter-spacing:4px;\n",
       "    border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/\n",
       "    border-bottom:1px solid #A0A0A0;\n",
       "    line-height: 1rem;\n",
       "    height: calc(100% - 2px);\n",
       "}\n",
       "\n",
       ".token {\n",
       "    white-space: pre;\n",
       "    position:relative;\n",
       "    color:black;\n",
       "    letter-spacing:2px;\n",
       "}\n",
       "\n",
       ".annotation{\n",
       "    white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */\n",
       "    border-radius:4px;\n",
       "    position:relative;\n",
       "    width:fit-content;\n",
       "}\n",
       ".annotation:before {\n",
       "    /*The before holds the text and the after holds the background*/\n",
       "    z-index:1000; /* Make sure this is above the background */\n",
       "    content:attr(data-label); /* The annotations label is on a data attribute */\n",
       "    color:white;\n",
       "    position:absolute;\n",
       "    font-size:1rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    left:0;\n",
       "    width:100%;\n",
       "    padding:0.5rem 0;\n",
       "    /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "    text-overflow:ellipsis;\n",
       "}\n",
       "\n",
       ".annotation:after {\n",
       "    content:attr(data-label); /* The content defines the width of the annotation*/\n",
       "    position:absolute;\n",
       "    font-size:0.75rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "    text-overflow:ellipsis;\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "\n",
       "    left:0;\n",
       "    width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
       "\n",
       "    padding:0.5rem 0;\n",
       "    /* Nast hack below:\n",
       "    We set the annotations color in code because we don't know the colors at css time.\n",
       "    But you can't pass a color as a data attribute to get it into the pseudo element (this thing)\n",
       "    So to get around that, annotations have the color set on them with a style attribute and then we\n",
       "    can get the color with currentColor.\n",
       "    Annotations wrap tokens and tokens set the color back to black\n",
       "     */\n",
       "    background-color: currentColor;\n",
       "}\n",
       ".annotation:hover::after, .annotation:hover::before{\n",
       "    /* When the user hovers over an annotation expand the label to display in full\n",
       "     */\n",
       "    min-width: fit-content;\n",
       "}\n",
       "\n",
       ".annotation:hover{\n",
       "    /* Emphasize the annotation start end with a border on hover*/\n",
       "    border-color: currentColor;\n",
       "    border: 2px solid;\n",
       "}\n",
       ".special-token:not(:empty){\n",
       "    /*\n",
       "    A none empty special token is like UNK (as opposed to CLS which has no representation in the text )\n",
       "     */\n",
       "    position:relative;\n",
       "}\n",
       ".special-token:empty::before{\n",
       "    /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/\n",
       "    content:attr(data-stok);\n",
       "    background:#202020;\n",
       "    font-size:0.75rem;\n",
       "    color:white;\n",
       "    margin: 0 0.25rem;\n",
       "    padding: 0.25rem;\n",
       "    border-radius:4px\n",
       "}\n",
       "\n",
       ".special-token:not(:empty):before {\n",
       "    /* Special tokens that have text (UNK) are displayed above the actual text*/\n",
       "    content:attr(data-stok);\n",
       "    position:absolute;\n",
       "    bottom:1.75rem;\n",
       "    min-width:100%;\n",
       "    width:100%;\n",
       "    height:1rem;\n",
       "    line-height:1rem;\n",
       "    font-size:1rem;\n",
       "    text-align:center;\n",
       "    color:white;\n",
       "    font-weight:bold;\n",
       "    background:#202020;\n",
       "    border-radius:10%;\n",
       "}\n",
       "/*\n",
       "We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations\n",
       "instead we apply even and odd class at generation time and color them that way\n",
       " */\n",
       ".even-token{\n",
       "    background:#DCDCDC\t;\n",
       "    border: 1px solid #DCDCDC;\n",
       "}\n",
       ".odd-token{\n",
       "    background:#A0A0A0;\n",
       "    border: 1px solid #A0A0A0;\n",
       "}\n",
       ".even-token.multi-token,.odd-token.multi-token{\n",
       "    background:  repeating-linear-gradient(\n",
       "    45deg,\n",
       "    transparent,\n",
       "    transparent 1px,\n",
       "    #ccc 1px,\n",
       "    #ccc 1px\n",
       "    ),\n",
       "    /* on \"bottom\" */\n",
       "    linear-gradient(\n",
       "    to bottom,\n",
       "    #FFB6C1,\n",
       "    #999\n",
       "    );\n",
       "}\n",
       "\n",
       ".multi-token:hover::after {\n",
       "    content:\"This char has more than 1 token\"; /* The content defines the width of the annotation*/\n",
       "    color:white;\n",
       "    background-color: black;\n",
       "    position:absolute;\n",
       "    font-size:0.75rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "    text-overflow:ellipsis;\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "    left:0;\n",
       "    width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
       "    padding:0.5rem 0;\n",
       "}\n",
       "\n",
       "            </style>\n",
       "        </head>\n",
       "        <body>\n",
       "            <div class=\"tokenized-text\" dir=auto>\n",
       "            <span class=\"annotation\" style=\"color:hsl(61,32%,64%\" data-label=\"foo\"><span class=\"token odd-token\"  >Ma</span></span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"token odd-token\"  >th</span></span><span class=\"token odd-token\"  >ia</span><span class=\"annotation\" style=\"color:hsl(112,32%,64%\" data-label=\"poo\"><span class=\"token odd-token\"  >s</span><span class=\"non-token\"  > </span></span><span class=\"token even-token\"  >B</span><span class=\"annotation\" style=\"color:hsl(214,32%,64%\" data-label=\"shoe\"><span class=\"token even-token\"  >y</span><span class=\"token odd-token\"  >ne</span></span><span class=\"token odd-token\"  >n</span><span class=\"token even-token\"  >s</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >'</span><span class=\"token even-token\"  >Z͑ͫ̓ͪ̂ͫ</span><span class=\"annotation\" style=\"color:hsl(163,32%,64%\" data-label=\"random tandem bandem sandem landem fandom\"><span class=\"token even-token\"  >̽͏̴̙̤̞͉</span></span><span class=\"token even-token\"  >͚̯̞̠͍A</span><span class=\"non-token\"  >̴̵̜̰͔ͫ͗͢</span><span class=\"token odd-token\"  >L</span><span class=\"non-token\"  >̠ͨͧͩ͘</span><span class=\"token even-token\"  >G̴̻͈͍͔̹̑͗̎̅͛́</span><span class=\"annotation\" style=\"color:hsl(61,32%,64%\" data-label=\"foo\"><span class=\"token even-token\"  >O</span><span class=\"non-token\"  >̵̨͂̌̌͘</span></span><span class=\"non-token\"  >̹̻̝̳</span><span class=\"token odd-token\"  >!</span><span class=\"non-token\"  >̿̋ͥͥ̂</span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"non-token\"  >͖̬̰̙̗ͣ̐́́͜͞</span><span class=\"token even-token\"  >'</span><span class=\"token odd-token\"  >:</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >W</span></span><span class=\"token even-token\"  >henever</span><span class=\"non-token\"  > </span><span class=\"token 
odd-token\"  >you</span><span class=\"token even-token\"  >’</span><span class=\"token odd-token\"  >re</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >working</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >on</span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"non-token\"  > </span><span class=\"token even-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >piece</span></span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >of</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >Java</span><span class=\"token even-token\"  >Script</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >code</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >that</span><span class=\"annotation\" style=\"color:hsl(112,32%,64%\" data-label=\"poo\"><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >de</span></span><span class=\"token odd-token\"  >als</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >with</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >strings</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >or</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >regular</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >expressions</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >some</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >way</span><span class=\"token even-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >just</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >add</span><span class=\"non-token\"  > 
</span><span class=\"token odd-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >unit</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >test</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >that</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >contains</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >pile</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >of</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >po</span><span class=\"token even-token\"  >o</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >(</span><span class=\"token even-token special-token\"  data-stok=\"[UNK]\" >💩</span><span class=\"token odd-token\"  >)</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >string</span><span class=\"token odd-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token even-token special-token\"  data-stok=\"[UNK]\" >💩💩💩💩💩💩💩💩💩💩💩💩</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >and</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >see</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >if</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >anything</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >breaks</span><span class=\"token even-token\"  >.</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >It</span><span class=\"token even-token\"  >’</span><span class=\"token odd-token\"  >s</span><span class=\"non-token\"  > 
</span><span class=\"token even-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >quick</span><span class=\"token even-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >fun</span><span class=\"token even-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >and</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >easy</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >way</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >to</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >see</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >if</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >your</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >code</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >supports</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >as</span><span class=\"token odd-token\"  >tral</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >symbols</span><span class=\"token odd-token\"  >.</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >Once</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >you</span><span class=\"token even-token\"  >’</span><span class=\"token odd-token\"  >ve</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >found</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >Unicode</span><span class=\"token odd-token\"  >-</span><span class=\"token even-token\"  >related</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >bug</span><span class=\"non-token\"  > </span><span 
class=\"token even-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >your</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >code</span><span class=\"token odd-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >all</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >you</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >need</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >to</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >do</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >is</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >apply</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >the</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >techniques</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >discussed</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >this</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >post</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >to</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >fix</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >it</span><span class=\"token even-token\"  >.</span>\n",
       "            </div>\n",
       "        </body>\n",
       "    </html>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "visualizer(text,annotations=annotations)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using A Custom Annotation Format\n",
    "Every system has its own representation of annotations. That's why we can instantiate the EncodingVisualizer with a convertion function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'startPlace': 0, 'endPlace': 3, 'theTag': '0'},\n",
       " {'startPlace': 4, 'endPlace': 7, 'theTag': '4'},\n",
       " {'startPlace': 8, 'endPlace': 11, 'theTag': '8'},\n",
       " {'startPlace': 12, 'endPlace': 15, 'theTag': '12'},\n",
       " {'startPlace': 16, 'endPlace': 19, 'theTag': '16'}]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "funnyAnnotations = [dict(startPlace=i,endPlace=i+3,theTag=str(i)) for i in range(0,20,4)]\n",
    "funnyAnnotations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "converter = lambda funny: Annotation(start=funny['startPlace'], end=funny['endPlace'], label=funny['theTag'])\n",
    "visualizer = EncodingVisualizer(tokenizer=tokenizer, default_to_notebook=True, annotation_converter=converter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <html>\n",
       "        <head>\n",
       "            <style>\n",
       "                .tokenized-text {\n",
       "    width:100%;\n",
       "    padding:2rem;\n",
       "    max-height: 400px;\n",
       "    overflow-y: auto;\n",
       "    box-sizing:border-box;\n",
       "    line-height:4rem; /* Lots of space between lines */\n",
       "    font-family: \"Roboto Light\", \"Ubuntu Light\", \"Ubuntu\", monospace;\n",
       "    box-shadow: 2px 2px 2px rgba(0,0,0,0.2);\n",
       "    background-color: rgba(0,0,0,0.01);\n",
       "    letter-spacing:2px; /* Give some extra separation between chars */\n",
       "}\n",
       ".non-token{\n",
       "    /* White space and other things the tokenizer ignores*/\n",
       "    white-space: pre;\n",
       "    letter-spacing:4px;\n",
       "    border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/\n",
       "    border-bottom:1px solid #A0A0A0;\n",
       "    line-height: 1rem;\n",
       "    height: calc(100% - 2px);\n",
       "}\n",
       "\n",
       ".token {\n",
       "    white-space: pre;\n",
       "    position:relative;\n",
       "    color:black;\n",
       "    letter-spacing:2px;\n",
       "}\n",
       "\n",
       ".annotation{\n",
       "    white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */\n",
       "    border-radius:4px;\n",
       "    position:relative;\n",
       "    width:fit-content;\n",
       "}\n",
       ".annotation:before {\n",
       "    /*The before holds the text and the after holds the background*/\n",
       "    z-index:1000; /* Make sure this is above the background */\n",
       "    content:attr(data-label); /* The annotations label is on a data attribute */\n",
       "    color:white;\n",
       "    position:absolute;\n",
       "    font-size:1rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    left:0;\n",
       "    width:100%;\n",
       "    padding:0.5rem 0;\n",
       "    /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "    text-overflow:ellipsis;\n",
       "}\n",
       "\n",
       ".annotation:after {\n",
       "    content:attr(data-label); /* The content defines the width of the annotation*/\n",
       "    position:absolute;\n",
       "    font-size:0.75rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "    text-overflow:ellipsis;\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "\n",
       "    left:0;\n",
       "    width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
       "\n",
       "    padding:0.5rem 0;\n",
       "    /* Nast hack below:\n",
       "    We set the annotations color in code because we don't know the colors at css time.\n",
       "    But you can't pass a color as a data attribute to get it into the pseudo element (this thing)\n",
       "    So to get around that, annotations have the color set on them with a style attribute and then we\n",
       "    can get the color with currentColor.\n",
       "    Annotations wrap tokens and tokens set the color back to black\n",
       "     */\n",
       "    background-color: currentColor;\n",
       "}\n",
       ".annotation:hover::after, .annotation:hover::before{\n",
       "    /* When the user hovers over an annotation expand the label to display in full\n",
       "     */\n",
       "    min-width: fit-content;\n",
       "}\n",
       "\n",
       ".annotation:hover{\n",
       "    /* Emphasize the annotation start end with a border on hover*/\n",
       "    border-color: currentColor;\n",
       "    border: 2px solid;\n",
       "}\n",
       ".special-token:not(:empty){\n",
       "    /*\n",
       "    A none empty special token is like UNK (as opposed to CLS which has no representation in the text )\n",
       "     */\n",
       "    position:relative;\n",
       "}\n",
       ".special-token:empty::before{\n",
       "    /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/\n",
       "    content:attr(data-stok);\n",
       "    background:#202020;\n",
       "    font-size:0.75rem;\n",
       "    color:white;\n",
       "    margin: 0 0.25rem;\n",
       "    padding: 0.25rem;\n",
       "    border-radius:4px\n",
       "}\n",
       "\n",
       ".special-token:not(:empty):before {\n",
       "    /* Special tokens that have text (UNK) are displayed above the actual text*/\n",
       "    content:attr(data-stok);\n",
       "    position:absolute;\n",
       "    bottom:1.75rem;\n",
       "    min-width:100%;\n",
       "    width:100%;\n",
       "    height:1rem;\n",
       "    line-height:1rem;\n",
       "    font-size:1rem;\n",
       "    text-align:center;\n",
       "    color:white;\n",
       "    font-weight:bold;\n",
       "    background:#202020;\n",
       "    border-radius:10%;\n",
       "}\n",
       "/*\n",
       "We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations\n",
       "instead we apply even and odd class at generation time and color them that way\n",
       " */\n",
       ".even-token{\n",
       "    background:#DCDCDC\t;\n",
       "    border: 1px solid #DCDCDC;\n",
       "}\n",
       ".odd-token{\n",
       "    background:#A0A0A0;\n",
       "    border: 1px solid #A0A0A0;\n",
       "}\n",
       ".even-token.multi-token,.odd-token.multi-token{\n",
       "    background:  repeating-linear-gradient(\n",
       "    45deg,\n",
       "    transparent,\n",
       "    transparent 1px,\n",
       "    #ccc 1px,\n",
       "    #ccc 1px\n",
       "    ),\n",
       "    /* on \"bottom\" */\n",
       "    linear-gradient(\n",
       "    to bottom,\n",
       "    #FFB6C1,\n",
       "    #999\n",
       "    );\n",
       "}\n",
       "\n",
       ".multi-token:hover::after {\n",
       "    content:\"This char has more than 1 token\"; /* The content defines the width of the annotation*/\n",
       "    color:white;\n",
       "    background-color: black;\n",
       "    position:absolute;\n",
       "    font-size:0.75rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "    text-overflow:ellipsis;\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "    left:0;\n",
       "    width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
       "    padding:0.5rem 0;\n",
       "}\n",
       "\n",
       "            </style>\n",
       "        </head>\n",
       "        <body>\n",
       "            <div class=\"tokenized-text\" dir=auto>\n",
       "            <span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"0\"><span class=\"token odd-token\"  >Mat</span></span><span class=\"token odd-token\"  >h</span><span class=\"annotation\" style=\"color:hsl(163,32%,64%\" data-label=\"4\"><span class=\"token odd-token\"  >ias</span></span><span class=\"non-token\"  > </span><span class=\"annotation\" style=\"color:hsl(214,32%,64%\" data-label=\"8\"><span class=\"token even-token\"  >By</span><span class=\"token odd-token\"  >n</span></span><span class=\"token odd-token\"  >e</span><span class=\"annotation\" style=\"color:hsl(61,32%,64%\" data-label=\"12\"><span class=\"token odd-token\"  >n</span><span class=\"token even-token\"  >s</span><span class=\"non-token\"  > </span></span><span class=\"token odd-token\"  >'</span><span class=\"annotation\" style=\"color:hsl(112,32%,64%\" data-label=\"16\"><span class=\"token even-token\"  >Z͑ͫ</span></span><span class=\"token even-token\"  >̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A</span><span class=\"non-token\"  >̴̵̜̰͔ͫ͗͢</span><span class=\"token odd-token\"  >L</span><span class=\"non-token\"  >̠ͨͧͩ͘</span><span class=\"token even-token\"  >G̴̻͈͍͔̹̑͗̎̅͛́O</span><span class=\"non-token\"  >̵̨̹̻̝̳͂̌̌͘</span><span class=\"token odd-token\"  >!</span><span class=\"non-token\"  >͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞</span><span class=\"token even-token\"  >'</span><span class=\"token odd-token\"  >:</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >Whenever</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >you</span><span class=\"token even-token\"  >’</span><span class=\"token odd-token\"  >re</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >working</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >on</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >piece</span><span 
class=\"non-token\"  > </span><span class=\"token even-token\"  >of</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >Java</span><span class=\"token even-token\"  >Script</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >code</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >that</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >deals</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >with</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >strings</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >or</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >regular</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >expressions</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >some</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >way</span><span class=\"token even-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >just</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >add</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >unit</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >test</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >that</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >contains</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >pile</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >of</span><span class=\"non-token\"  > 
</span><span class=\"token odd-token\"  >po</span><span class=\"token even-token\"  >o</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >(</span><span class=\"token even-token special-token\"  data-stok=\"[UNK]\" >💩</span><span class=\"token odd-token\"  >)</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >string</span><span class=\"token odd-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token even-token special-token\"  data-stok=\"[UNK]\" >💩💩💩💩💩💩💩💩💩💩💩💩</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >and</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >see</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >if</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >anything</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >breaks</span><span class=\"token even-token\"  >.</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >It</span><span class=\"token even-token\"  >’</span><span class=\"token odd-token\"  >s</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >quick</span><span class=\"token even-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >fun</span><span class=\"token even-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >and</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >easy</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >way</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >to</span><span class=\"non-token\"  > 
</span><span class=\"token odd-token\"  >see</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >if</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >your</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >code</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >supports</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >as</span><span class=\"token odd-token\"  >tral</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >symbols</span><span class=\"token odd-token\"  >.</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >Once</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >you</span><span class=\"token even-token\"  >’</span><span class=\"token odd-token\"  >ve</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >found</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >a</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >Unicode</span><span class=\"token odd-token\"  >-</span><span class=\"token even-token\"  >related</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >bug</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >your</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >code</span><span class=\"token odd-token\"  >,</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >all</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >you</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >need</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >to</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >do</span><span 
class=\"non-token\"  > </span><span class=\"token odd-token\"  >is</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >apply</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >the</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >techniques</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >discussed</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >in</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >this</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >post</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >to</span><span class=\"non-token\"  > </span><span class=\"token even-token\"  >fix</span><span class=\"non-token\"  > </span><span class=\"token odd-token\"  >it</span><span class=\"token even-token\"  >.</span>\n",
       "            </div>\n",
       "        </body>\n",
       "    </html>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "visualizer(text, annotations=funnyAnnotations)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Trying with Roberta\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2020-12-04 09:25:00--  https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\n",
      "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.226.19\n",
      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.226.19|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 898823 (878K) [application/json]\n",
      "Saving to: ‘/tmp/roberta-base-vocab.json’\n",
      "\n",
      "/tmp/roberta-base-v 100%[===================>] 877.76K  4.35MB/s    in 0.2s    \n",
      "\n",
      "2020-12-04 09:25:00 (4.35 MB/s) - ‘/tmp/roberta-base-vocab.json’ saved [898823/898823]\n",
      "\n",
      "--2020-12-04 09:25:00--  https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\n",
      "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.253\n",
      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.253|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 456318 (446K) [text/plain]\n",
      "Saving to: ‘/tmp/roberta-base-merges.txt’\n",
      "\n",
      "/tmp/roberta-base-m 100%[===================>] 445.62K  --.-KB/s    in 0.1s    \n",
      "\n",
      "2020-12-04 09:25:01 (4.04 MB/s) - ‘/tmp/roberta-base-merges.txt’ saved [456318/456318]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json\" -O /tmp/roberta-base-vocab.json\n",
    "!wget \"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt\" -O /tmp/roberta-base-merges.txt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <html>\n",
       "        <head>\n",
       "            <style>\n",
       "                .tokenized-text {\n",
       "    width:100%;\n",
       "    padding:2rem;\n",
       "    max-height: 400px;\n",
       "    overflow-y: auto;\n",
       "    box-sizing:border-box;\n",
       "    line-height:4rem; /* Lots of space between lines */\n",
       "    font-family: \"Roboto Light\", \"Ubuntu Light\", \"Ubuntu\", monospace;\n",
       "    box-shadow: 2px 2px 2px rgba(0,0,0,0.2);\n",
       "    background-color: rgba(0,0,0,0.01);\n",
       "    letter-spacing:2px; /* Give some extra separation between chars */\n",
       "}\n",
       ".non-token{\n",
       "    /* White space and other things the tokenizer ignores*/\n",
       "    white-space: pre;\n",
       "    letter-spacing:4px;\n",
       "    border-top:1px solid #A0A0A0; /* A gentle border on top and bottom makes tabs more ovious*/\n",
       "    border-bottom:1px solid #A0A0A0;\n",
       "    line-height: 1rem;\n",
       "    height: calc(100% - 2px);\n",
       "}\n",
       "\n",
       ".token {\n",
       "    white-space: pre;\n",
       "    position:relative;\n",
       "    color:black;\n",
       "    letter-spacing:2px;\n",
       "}\n",
       "\n",
       ".annotation{\n",
       "    white-space:nowrap; /* Important - ensures that annotations appears even if the annotated text wraps a line */\n",
       "    border-radius:4px;\n",
       "    position:relative;\n",
       "    width:fit-content;\n",
       "}\n",
       ".annotation:before {\n",
       "    /*The before holds the text and the after holds the background*/\n",
       "    z-index:1000; /* Make sure this is above the background */\n",
       "    content:attr(data-label); /* The annotations label is on a data attribute */\n",
       "    color:white;\n",
       "    position:absolute;\n",
       "    font-size:1rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    left:0;\n",
       "    width:100%;\n",
       "    padding:0.5rem 0;\n",
       "    /* These make it so an annotation doesn't stretch beyond the annotated text if the label is longer*/\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "    text-overflow:ellipsis;\n",
       "}\n",
       "\n",
       ".annotation:after {\n",
       "    content:attr(data-label); /* The content defines the width of the annotation*/\n",
       "    position:absolute;\n",
       "    font-size:0.75rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "    text-overflow:ellipsis;\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "\n",
       "    left:0;\n",
       "    width:100%; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
       "\n",
       "    padding:0.5rem 0;\n",
       "    /* Nast hack below:\n",
       "    We set the annotations color in code because we don't know the colors at css time.\n",
       "    But you can't pass a color as a data attribute to get it into the pseudo element (this thing)\n",
       "    So to get around that, annotations have the color set on them with a style attribute and then we\n",
       "    can get the color with currentColor.\n",
       "    Annotations wrap tokens and tokens set the color back to black\n",
       "     */\n",
       "    background-color: currentColor;\n",
       "}\n",
       ".annotation:hover::after, .annotation:hover::before{\n",
       "    /* When the user hovers over an annotation expand the label to display in full\n",
       "     */\n",
       "    min-width: fit-content;\n",
       "}\n",
       "\n",
       ".annotation:hover{\n",
       "    /* Emphasize the annotation start end with a border on hover*/\n",
       "    border-color: currentColor;\n",
       "    border: 2px solid;\n",
       "}\n",
       ".special-token:not(:empty){\n",
       "    /*\n",
       "    A none empty special token is like UNK (as opposed to CLS which has no representation in the text )\n",
       "     */\n",
       "    position:relative;\n",
       "}\n",
       ".special-token:empty::before{\n",
       "    /* Special tokens that don't have text are displayed as pseudo elements so we dont select them with the mouse*/\n",
       "    content:attr(data-stok);\n",
       "    background:#202020;\n",
       "    font-size:0.75rem;\n",
       "    color:white;\n",
       "    margin: 0 0.25rem;\n",
       "    padding: 0.25rem;\n",
       "    border-radius:4px\n",
       "}\n",
       "\n",
       ".special-token:not(:empty):before {\n",
       "    /* Special tokens that have text (UNK) are displayed above the actual text*/\n",
       "    content:attr(data-stok);\n",
       "    position:absolute;\n",
       "    bottom:1.75rem;\n",
       "    min-width:100%;\n",
       "    width:100%;\n",
       "    height:1rem;\n",
       "    line-height:1rem;\n",
       "    font-size:1rem;\n",
       "    text-align:center;\n",
       "    color:white;\n",
       "    font-weight:bold;\n",
       "    background:#202020;\n",
       "    border-radius:10%;\n",
       "}\n",
       "/*\n",
       "We want to alternate the color of tokens, but we can't use nth child because tokens might be broken up by annotations\n",
       "instead we apply even and odd class at generation time and color them that way\n",
       " */\n",
       ".even-token{\n",
       "    background:#DCDCDC\t;\n",
       "    border: 1px solid #DCDCDC;\n",
       "}\n",
       ".odd-token{\n",
       "    background:#A0A0A0;\n",
       "    border: 1px solid #A0A0A0;\n",
       "}\n",
       ".even-token.multi-token,.odd-token.multi-token{\n",
       "    background:  repeating-linear-gradient(\n",
       "    45deg,\n",
       "    transparent,\n",
       "    transparent 1px,\n",
       "    #ccc 1px,\n",
       "    #ccc 1px\n",
       "    ),\n",
       "    /* on \"bottom\" */\n",
       "    linear-gradient(\n",
       "    to bottom,\n",
       "    #FFB6C1,\n",
       "    #999\n",
       "    );\n",
       "}\n",
       "\n",
       ".multi-token:hover::after {\n",
       "    content:\"This char has more than 1 token\"; /* The content defines the width of the annotation*/\n",
       "    color:white;\n",
       "    background-color: black;\n",
       "    position:absolute;\n",
       "    font-size:0.75rem;\n",
       "    text-align:center;\n",
       "    font-weight:bold;\n",
       "    text-overflow:ellipsis;\n",
       "    top:1.75rem;\n",
       "    line-height:0;\n",
       "    overflow: hidden;\n",
       "    white-space: nowrap;\n",
       "    left:0;\n",
       "    width:fit-content; /* 100% of the parent, which is the annotation whose width is the tokens inside it*/\n",
       "    padding:0.5rem 0;\n",
       "}\n",
       "\n",
       "            </style>\n",
       "        </head>\n",
       "        <body>\n",
       "            <div class=\"tokenized-text\" dir=auto>\n",
       "            <span class=\"annotation\" style=\"color:hsl(61,32%,64%\" data-label=\"foo\"><span class=\"token even-token\"  >Ma</span></span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"token even-token\"  >th</span></span><span class=\"token odd-token\"  >ia</span><span class=\"annotation\" style=\"color:hsl(112,32%,64%\" data-label=\"poo\"><span class=\"token odd-token\"  >s</span><span class=\"token even-token\"  > </span></span><span class=\"token even-token\"  >B</span><span class=\"annotation\" style=\"color:hsl(214,32%,64%\" data-label=\"shoe\"><span class=\"token odd-token\"  >yn</span><span class=\"token even-token\"  >e</span></span><span class=\"token even-token\"  >ns</span><span class=\"token odd-token\"  > '</span><span class=\"token even-token\"  >Z</span><span class=\"token multi-token odd-token\"  >͑</span><span class=\"token multi-token odd-token\"  >ͫ</span><span class=\"token multi-token odd-token\"  >̓</span><span class=\"token multi-token odd-token\"  >ͪ</span><span class=\"token multi-token odd-token\"  >̂</span><span class=\"token multi-token odd-token\"  >ͫ</span><span class=\"annotation\" style=\"color:hsl(163,32%,64%\" data-label=\"random tandem bandem sandem landem fandom\"><span class=\"token multi-token odd-token\"  >̽</span><span class=\"token multi-token odd-token\"  >͏</span><span class=\"token multi-token odd-token\"  >̴</span><span class=\"token multi-token odd-token\"  >̙</span><span class=\"token multi-token odd-token\"  >̤</span><span class=\"token multi-token odd-token\"  >̞</span><span class=\"token multi-token odd-token\"  >͉</span></span><span class=\"token multi-token odd-token\"  >͚</span><span class=\"token multi-token odd-token\"  >̯</span><span class=\"token multi-token odd-token\"  >̞</span><span class=\"token multi-token odd-token\"  >̠</span><span class=\"token multi-token odd-token\"  >͍</span><span class=\"token odd-token\"  >A</span><span class=\"token 
multi-token even-token\"  >ͫ</span><span class=\"token multi-token even-token\"  >͗</span><span class=\"token multi-token even-token\"  >̴</span><span class=\"token multi-token even-token\"  >͢</span><span class=\"token multi-token even-token\"  >̵</span><span class=\"token multi-token even-token\"  >̜</span><span class=\"token multi-token even-token\"  >̰</span><span class=\"token multi-token even-token\"  >͔</span><span class=\"token even-token\"  >L</span><span class=\"token multi-token odd-token\"  >ͨ</span><span class=\"token multi-token odd-token\"  >ͧ</span><span class=\"token multi-token odd-token\"  >ͩ</span><span class=\"token multi-token odd-token\"  >͘</span><span class=\"token multi-token odd-token\"  >̠</span><span class=\"token odd-token\"  >G</span><span class=\"token multi-token even-token\"  >̑</span><span class=\"token multi-token even-token\"  >͗</span><span class=\"token multi-token even-token\"  >̎</span><span class=\"token multi-token even-token\"  >̅</span><span class=\"token multi-token even-token\"  >͛</span><span class=\"token multi-token even-token\"  >́</span><span class=\"token multi-token even-token\"  >̴</span><span class=\"token multi-token even-token\"  >̻</span><span class=\"token multi-token even-token\"  >͈</span><span class=\"token multi-token even-token\"  >͍</span><span class=\"token multi-token even-token\"  >͔</span><span class=\"token multi-token even-token\"  >̹</span><span class=\"annotation\" style=\"color:hsl(61,32%,64%\" data-label=\"foo\"><span class=\"token even-token\"  >O</span><span class=\"token multi-token odd-token\"  >͂</span><span class=\"token multi-token odd-token\"  >̌</span><span class=\"token multi-token odd-token\"  >̌</span><span class=\"token multi-token odd-token\"  >͘</span><span class=\"token multi-token odd-token\"  >̨</span><span class=\"token multi-token odd-token\"  >̵</span></span><span class=\"token multi-token odd-token\"  >̹</span><span class=\"token multi-token odd-token\"  >̻</span><span 
class=\"token multi-token odd-token\"  >̝</span><span class=\"token multi-token odd-token\"  >̳</span><span class=\"token odd-token\"  >!</span><span class=\"token multi-token even-token\"  >̿</span><span class=\"token multi-token even-token\"  >̋</span><span class=\"token multi-token even-token\"  >ͥ</span><span class=\"token multi-token even-token\"  >ͥ</span><span class=\"token multi-token even-token\"  >̂</span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"token multi-token even-token\"  >ͣ</span><span class=\"token multi-token even-token\"  >̐</span><span class=\"token multi-token even-token\"  >́</span><span class=\"token multi-token even-token\"  >́</span><span class=\"token multi-token even-token\"  >͞</span><span class=\"token multi-token even-token\"  >͜</span><span class=\"token multi-token even-token\"  >͖</span><span class=\"token multi-token even-token\"  >̬</span><span class=\"token multi-token even-token\"  >̰</span><span class=\"token multi-token even-token\"  >̙</span><span class=\"token multi-token even-token\"  >̗</span><span class=\"token even-token\"  >':</span><span class=\"token odd-token\"  > W</span></span><span class=\"token odd-token\"  >henever</span><span class=\"token even-token\"  > you</span><span class=\"token multi-token odd-token\"  >’</span><span class=\"token odd-token\"  >re</span><span class=\"token even-token\"  > working</span><span class=\"token odd-token\"  > on</span><span class=\"annotation\" style=\"color:hsl(10,32%,64%\" data-label=\"bar\"><span class=\"token even-token\"  > a</span><span class=\"token odd-token\"  > piece</span></span><span class=\"token even-token\"  > of</span><span class=\"token odd-token\"  > JavaScript</span><span class=\"token even-token\"  > code</span><span class=\"token odd-token\"  > that</span><span class=\"annotation\" style=\"color:hsl(112,32%,64%\" data-label=\"poo\"><span class=\"token even-token\"  > de</span></span><span class=\"token 
even-token\"  >als</span><span class=\"token odd-token\"  > with</span><span class=\"token even-token\"  > strings</span><span class=\"token odd-token\"  > or</span><span class=\"token even-token\"  > regular</span><span class=\"token odd-token\"  > expressions</span><span class=\"token even-token\"  > in</span><span class=\"token odd-token\"  > some</span><span class=\"token even-token\"  > way</span><span class=\"token odd-token\"  >,</span><span class=\"token even-token\"  > just</span><span class=\"token odd-token\"  > add</span><span class=\"token even-token\"  > a</span><span class=\"token odd-token\"  > unit</span><span class=\"token even-token\"  > test</span><span class=\"token odd-token\"  > that</span><span class=\"token even-token\"  > contains</span><span class=\"token odd-token\"  > a</span><span class=\"token even-token\"  > pile</span><span class=\"token odd-token\"  > of</span><span class=\"token even-token\"  > po</span><span class=\"token odd-token\"  >o</span><span class=\"token even-token\"  > (</span><span class=\"token multi-token odd-token\"  >💩</span><span class=\"token even-token\"  >)</span><span class=\"token odd-token\"  > in</span><span class=\"token even-token\"  > a</span><span class=\"token odd-token\"  > string</span><span class=\"token even-token\"  >,</span><span class=\"token odd-token\"  > 💩</span><span class=\"token multi-token even-token\"  >💩</span><span class=\"token multi-token odd-token\"  >💩</span><span class=\"token multi-token even-token\"  >💩</span><span class=\"token multi-token odd-token\"  >💩</span><span class=\"token multi-token even-token\"  >💩</span><span class=\"token multi-token odd-token\"  >💩</span><span class=\"token multi-token even-token\"  >💩</span><span class=\"token multi-token odd-token\"  >💩</span><span class=\"token multi-token even-token\"  >💩</span><span class=\"token multi-token odd-token\"  >💩</span><span class=\"token multi-token even-token\"  >💩</span><span class=\"token odd-token\"  > 
and</span><span class=\"token even-token\"  > see</span><span class=\"token odd-token\"  > if</span><span class=\"token even-token\"  > anything</span><span class=\"token odd-token\"  > breaks</span><span class=\"token even-token\"  >.</span><span class=\"token odd-token\"  > It</span><span class=\"token multi-token even-token\"  >’</span><span class=\"token even-token\"  >s</span><span class=\"token odd-token\"  > a</span><span class=\"token even-token\"  > quick</span><span class=\"token odd-token\"  >,</span><span class=\"token even-token\"  > fun</span><span class=\"token odd-token\"  >,</span><span class=\"token even-token\"  > and</span><span class=\"token odd-token\"  > easy</span><span class=\"token even-token\"  > way</span><span class=\"token odd-token\"  > to</span><span class=\"token even-token\"  > see</span><span class=\"token odd-token\"  > if</span><span class=\"token even-token\"  > your</span><span class=\"token odd-token\"  > code</span><span class=\"token even-token\"  > supports</span><span class=\"token odd-token\"  > ast</span><span class=\"token even-token\"  >ral</span><span class=\"token odd-token\"  > symbols</span><span class=\"token even-token\"  >.</span><span class=\"token odd-token\"  > Once</span><span class=\"token even-token\"  > you</span><span class=\"token multi-token odd-token\"  >’</span><span class=\"token odd-token\"  >ve</span><span class=\"token even-token\"  > found</span><span class=\"token odd-token\"  > a</span><span class=\"token even-token\"  > Unicode</span><span class=\"token odd-token\"  >-</span><span class=\"token even-token\"  >related</span><span class=\"token odd-token\"  > bug</span><span class=\"token even-token\"  > in</span><span class=\"token odd-token\"  > your</span><span class=\"token even-token\"  > code</span><span class=\"token odd-token\"  >,</span><span class=\"token even-token\"  > all</span><span class=\"token odd-token\"  > you</span><span class=\"token even-token\"  > need</span><span 
class=\"token odd-token\"  > to</span><span class=\"token even-token\"  > do</span><span class=\"token odd-token\"  > is</span><span class=\"token even-token\"  > apply</span><span class=\"token odd-token\"  > the</span><span class=\"token even-token\"  > techniques</span><span class=\"token odd-token\"  > discussed</span><span class=\"token even-token\"  > in</span><span class=\"token odd-token\"  > this</span><span class=\"token even-token\"  > post</span><span class=\"token odd-token\"  > to</span><span class=\"token even-token\"  > fix</span><span class=\"token odd-token\"  > it</span><span class=\"token even-token\"  >.</span>\n",
       "            </div>\n",
       "        </body>\n",
       "    </html>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from tokenizers import ByteLevelBPETokenizer\n",
    "\n",
    "# Load the byte-level BPE tokenizer from the RoBERTa vocab/merges files in /tmp.\n",
    "roberta_tokenizer = ByteLevelBPETokenizer.from_file(\n",
    "    '/tmp/roberta-base-vocab.json',\n",
    "    '/tmp/roberta-base-merges.txt',\n",
    ")\n",
    "\n",
    "# Wrap it in a visualizer configured with default_to_notebook=True.\n",
    "roberta_visualizer = EncodingVisualizer(\n",
    "    tokenizer=roberta_tokenizer,\n",
    "    default_to_notebook=True,\n",
    ")\n",
    "\n",
    "# Final expression of the cell: visualize `text` with the given annotations.\n",
    "roberta_visualizer(text, annotations=annotations)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
